# Read the 2014 housing-market data; keep `Zip Code` as character so the
# zip codes can be used as join keys rather than numbers.
hzip15 <- read_csv('./dash-austin-crime-report-2015/2014_Housing_Market_Analysis_Data_by_Zip_Code.csv',
col_types = cols(`Zip Code` = col_character()))
# Read the joined crime/housing data; both zip-code columns stay character
# and Clearance_Date is parsed from the "01-Jan-15" style format.
dat <- read_csv('./dash-austin-crime-report-2015/Crime_Housing_Joined.csv',
col_types = cols(Zip_Code_Crime = col_character(),
Zip_Code_Housing = col_character(),
Clearance_Date = col_date(format = '%d-%b-%y')))
We shall refer to Crime_Housing_Joined.csv as the crime data set, and 2014_Housing_Market_Analysis_Data_by_Zip_Code.csv as the housing data set. The crime data set contains data regarding specific crimes that occurred: for example, the type of crime, the zip code where the crime occurred, and the latitude and longitude of where the crime occurred. The variables generally have long names that roughly describe the variable. Details are omitted for some crimes due to the nature of those crimes. For example, the coordinates are not provided for rapes. The housing data set provides housing and demographic information about thirty-six zip codes within Austin, Texas. The names of the variables in both data sets are given with the code below. Using functions such as `str()` or `glimpse()` created too much output for the report.
# Report the size and the column names of the crime data set.
cat("Dimension of crime data set is", dim(dat), '\n')
## Dimension of crime data set is 38573 43
print("Crime Data set Variables")
## [1] "Crime Data set Variables"
print(names(dat))
## [1] "Key"
## [2] "Council_District"
## [3] "Highest_Offense_Desc"
## [4] "Highest_NIBRS_UCR_Offense_Description"
## [5] "Report_Date"
## [6] "Location"
## [7] "Clearance_Status"
## [8] "Clearance_Date"
## [9] "District"
## [10] "Zip_Code_Crime"
## [11] "Census_Tract"
## [12] "X_Coordinate"
## [13] "Y_Coordinate"
## [14] "Zip_Code_Housing"
## [15] "Populationbelowpovertylevel"
## [16] "Medianhouseholdincome"
## [17] "Non-WhiteNon-HispanicorLatino"
## [18] "HispanicorLatinoofanyrace"
## [19] "Populationwithdisability"
## [20] "Unemployment"
## [21] "Largehouseholds(5+members)"
## [22] "Homesaffordabletopeopleearninglessthan$50000"
## [23] "Rentalsaffordabletopeopleearninglessthan$25000"
## [24] "Rent-restrictedunits"
## [25] "HousingChoiceVoucherholders"
## [26] "Medianrent"
## [27] "Medianhomevalue"
## [28] "Percentageofrentalunitsinpoorcondition"
## [29] "Percentchangeinnumberofhousingunits2000-2012"
## [30] "Ownerunitsaffordabletoaverageretail/serviceworker"
## [31] "Rentalunitsaffordabletoaverageretail/serviceworker"
## [32] "Rentalunitsaffordabletoaverageartist"
## [33] "Ownerunitsaffordabletoaverageartist"
## [34] "Rentalunitsaffordabletoaverageteacher"
## [35] "Ownerunitsaffordabletoaverageteacher"
## [36] "Rentalunitsaffordabletoaveragetechworker"
## [37] "Ownerunitsaffordabletoaveragetechworker"
## [38] "Changeinpercentageofpopulationbelowpoverty2000-2012"
## [39] "Changeinmedianrent2000-2012"
## [40] "Changeinmedianhomevalue2000-2012"
## [41] "Percentageofhomeswithin1/4-mioftransitstop"
## [42] "Averagemonthlytransportationcost"
## [43] "Percentageofhousingandtransportationcoststhatistransportation-related"
# Report the size and the column names of the housing data set.
cat("Dimension of housing data set is", dim(hzip15), '\n')
## Dimension of housing data set is 37 30
print("Housing Data set Variables")
## [1] "Housing Data set Variables"
print(names(hzip15))
## [1] "Zip Code"
## [2] "Population below poverty level"
## [3] "Median household income"
## [4] "Non-White, Non-Hispanic or Latino"
## [5] "Hispanic or Latino, of any race"
## [6] "Population with disability"
## [7] "Unemployment"
## [8] "Large households (5+ members)"
## [9] "Homes affordable to people earning less than $50,000"
## [10] "Rentals affordable to people earning less than $25,000"
## [11] "Rent-restricted units"
## [12] "Housing Choice Voucher holders"
## [13] "Median rent"
## [14] "Median home value"
## [15] "Percentage of rental units in poor condition"
## [16] "Percent change in number of housing units, 2000-2012"
## [17] "Owner units affordable to average retail/service worker"
## [18] "Rental units affordable to average retail/service worker"
## [19] "Rental units affordable to average artist"
## [20] "Owner units affordable to average artist"
## [21] "Rental units affordable to average teacher"
## [22] "Owner units affordable to average teacher"
## [23] "Rental units affordable to average tech worker"
## [24] "Owner units affordable to average tech worker"
## [25] "Change in percentage of population below poverty, 2000-2012"
## [26] "Change in median rent, 2000-2012"
## [27] "Change in median home value, 2000-2012"
## [28] "Percentage of homes within 1/4-mi of transit stop"
## [29] "Average monthly transportation cost"
## [30] "Percentage of housing and transportation costs that is transportation-related"
There are two problems with the data sets. First, we have to map the coordinates provided in the crime data set to the coordinate reference system used by the ggmap package. We’ll do that in the following code. Secondly, much of the data is numeric but in formats that caused the data to be read in as character. This could have been avoided by reading in the data with a parser from the readr package. However, since there are dozens of such variables in both data sets, I chose to write my own function to clean the data, then apply the function columnwise using the `apply()` function.
The following code uses the sp and rgdal packages to transform the coordinate reference system.
# Transform crime coordinates from the Texas State Plane system
# (ESRI:102739) into WGS84 longitude/latitude (EPSG:4326) so they can be
# overlaid on ggmap tiles.
data.frame(lon = dat$X_Coordinate, lat = dat$Y_Coordinate) %>%
filter(complete.cases(.)) -> df_coords
coordinates(df_coords) <- c("lon", "lat")  # promote to SpatialPoints
proj4string(df_coords) <- CRS("+init=esri:102739")  # declare the source CRS
CRS.new <- CRS("+init=epsg:4326")
df_coords_trans <- spTransform(df_coords, CRS.new)
# NOTE(review): the mutate() below relies on this filter() producing rows
# in exactly the same order as df_coords_trans@coords — true because both
# come from the same complete-case filter on dat. Do not reorder.
dat %>%
filter(!(is.na(X_Coordinate) | is.na(Y_Coordinate))) %>%
mutate(X_Coordinate = df_coords_trans@coords[ ,1],
Y_Coordinate = df_coords_trans@coords[ ,2]) %>%
select(Key, X_Coordinate, Y_Coordinate) -> dat_coords
# Replace the planar coordinates with the transformed ones, rejoining by
# the crime record Key; rows lacking coordinates get NA via the full join.
dat %>%
select(-X_Coordinate, -Y_Coordinate) %>%
full_join(dat_coords, by = "Key") -> dat
The following code converts data to numeric by removing the symbols “%”, “$”, and “,”.
# Set aside the columns that should be numeric: everything except the
# identifier, categorical, date, and zip-code columns.
dat %>%
  select(-c(Key, Council_District, Highest_Offense_Desc,
            Highest_NIBRS_UCR_Offense_Description, Report_Date, Location,
            Clearance_Status, Clearance_Date, District, Zip_Code_Crime,
            X_Coordinate, Y_Coordinate, Zip_Code_Housing)) -> dat_num
my_sub <- function(x){
  # Strip the formatting characters "%", "$", and thousands "," from a
  # character vector and coerce it to numeric. Values that are still not
  # numeric after cleaning become NA (with the usual coercion warning).
  #
  # Args:
  #   x: character vector of formatted numbers, e.g. "45%", "$1,234".
  # Returns:
  #   Numeric vector the same length as x.
  #
  # Fix: one character-class gsub replaces the original's three sequential
  # passes, and the result is returned directly instead of through a final
  # assignment (which only returned invisibly).
  as.numeric(gsub("[%$,]", "", x))
}
# Apply the cleaner column-wise. NOTE(review): apply() first coerces the
# data frame to a character matrix; that is safe here because every column
# left in dat_num is character, but it would mangle mixed types.
dat_mat <- apply(dat_num, 2, my_sub)
df_num <- tbl_df(dat_mat)  # tbl_df() is deprecated in newer dplyr; as_tibble() is the modern equivalent
# Swap the raw character columns in dat for their cleaned numeric versions.
var_ix_2drop <- which(names(dat) %in% colnames(dat_mat))
dat %>%
select(-var_ix_2drop) %>%
bind_cols(df_num) -> dat
Here, we apply the same method to the housing data set.
# Apply the same cleaner to the housing data. apply() coerces everything
# (including `Zip Code`) to numeric, so the zip codes must be cast back to
# character afterwards.
hzip_mat <- apply(hzip15, 2, my_sub)
hzip15 <- tbl_df(hzip_mat)
# Fix: the original piped into mutate() without assigning the result, so
# hzip15 silently kept a numeric `Zip Code`. Assign it, then print the
# tibble (matching the output shown below).
hzip15 <- hzip15 %>%
  mutate(`Zip Code` = as.character(`Zip Code`))
hzip15
## # A tibble: 37 x 30
## `Zip Code` `Population below poverty level` `Median household income`
## <chr> <dbl> <dbl>
## 1 78726 9 66096
## 2 <NA> 19 52431
## 3 78724 38 35711
## 4 78617 18 43957
## 5 78701 20 68152
## 6 78702 33 34734
## 7 78703 10 92606
## 8 78704 21 50248
## 9 78705 66 11917
## 10 78717 3 93305
## # ... with 27 more rows, and 27 more variables: `Non-White, Non-Hispanic
## # or Latino` <dbl>, `Hispanic or Latino, of any race` <dbl>, `Population
## # with disability` <dbl>, Unemployment <dbl>, `Large households (5+
## # members)` <dbl>, `Homes affordable to people earning less than
## # $50,000` <dbl>, `Rentals affordable to people earning less than
## # $25,000` <dbl>, `Rent-restricted units` <dbl>, `Housing Choice Voucher
## # holders` <dbl>, `Median rent` <dbl>, `Median home value` <dbl>,
## # `Percentage of rental units in poor condition` <dbl>, `Percent change
## # in number of housing units, 2000-2012` <dbl>, `Owner units affordable
## # to average retail/service worker` <dbl>, `Rental units affordable to
## # average retail/service worker` <dbl>, `Rental units affordable to
## # average artist` <dbl>, `Owner units affordable to average
## # artist` <dbl>, `Rental units affordable to average teacher` <dbl>,
## # `Owner units affordable to average teacher` <dbl>, `Rental units
## # affordable to average tech worker` <dbl>, `Owner units affordable to
## # average tech worker` <dbl>, `Change in percentage of population below
## # poverty, 2000-2012` <dbl>, `Change in median rent, 2000-2012` <dbl>,
## # `Change in median home value, 2000-2012` <dbl>, `Percentage of homes
## # within 1/4-mi of transit stop` <dbl>, `Average monthly transportation
## # cost` <dbl>, `Percentage of housing and transportation costs that is
## # transportation-related` <dbl>
Now, let’s visualize the coordinates where each crime was committed for the crimes where the coordinates are provided.
# Scatter of crime locations with 2-D density contours, saved to plt1 so
# the k-means cluster centers can be layered on top of it later.
dat %>%
  filter(!(is.na(X_Coordinate) | is.na(Y_Coordinate))) -> df_no_missing
df_no_missing %>%
  ggplot(aes(X_Coordinate, Y_Coordinate)) +
  # Fix: the point layer inherits x/y from ggplot(); the original repeated
  # the same aes() mapping redundantly.
  geom_point(alpha = 0.2, shape = 1) +
  geom_density2d() +
  # Fix: corrected the "Lattitude" typo in the displayed axis label.
  labs(x = "Longitude", y = "Latitude", title = 'All Crime in Austin, TX') +
  theme_grey() -> plt1
plt1
We can use the ggmap package to pull a map of Austin, Texas and overlay these points.
# Fetch a terrain base map of Austin from the Google Static Maps API.
# NOTE(review): newer ggmap versions require a registered Google API key
# (register_google()) before this call works — confirm before re-running.
amap11 <- get_map(location = 'Austin, Texas', zoom = 11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=Austin,+Texas&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=Austin,%20Texas&sensor=false
Now, we can overlay the points in the previous plot with an actual map of Austin, Texas.
# Overlay the crime points and density contours on the Austin base map.
# Points falling outside the zoom-11 map extent are dropped (see warnings).
ggmap(amap11)+
geom_point(aes(X_Coordinate, Y_Coordinate), data = df_no_missing,
alpha = 0.2, shape = 1)+
geom_density2d(aes(X_Coordinate, Y_Coordinate), data = df_no_missing)+
labs(x = "Longitude", y = "Lattitude", title = 'All Crime in Austin, TX')
## Warning: Removed 896 rows containing non-finite values (stat_density2d).
## Warning: Removed 896 rows containing missing values (geom_point).
As shown in the warning messages, 896 observations were omitted because of the zoom. I did try to overlay this with a zoom that would include all the observations, but this was not very insightful.
We can visualize the different types of crimes using the `facet_wrap()` function with the variable Highest_NIBRS_UCR_Offense_Description.
# Same map overlay as above, but faceted by the NIBRS/UCR offense
# category so each crime type gets its own panel.
ggmap(amap11)+
geom_point(aes(X_Coordinate, Y_Coordinate), data = df_no_missing,
alpha = 0.2, shape = 1)+
geom_density2d(aes(X_Coordinate, Y_Coordinate), data = df_no_missing)+
facet_wrap(~Highest_NIBRS_UCR_Offense_Description)+
labs(x = "Longitude", y = "Lattitude", title = 'Crime in Austin, TX')
## Warning: Removed 896 rows containing non-finite values (stat_density2d).
## Warning: Removed 896 rows containing missing values (geom_point).
# Daily crime counts by clearance date, plotted as a time series.
dat %>%
  select(Clearance_Date) %>%
  filter(!is.na(Clearance_Date)) %>%
  group_by(Clearance_Date) %>%
  summarise(Count = n()) %>%
  arrange(Clearance_Date) %>%
  ggplot(aes(Clearance_Date, Count)) +
  geom_line() +
  theme_grey() +
  labs(x='Clearance Date', y='Number of Crimes', title='Number of Crimes over Time')
There seems to be a large downward trend after January 2016. Also, there seems to be a very cyclical pattern; perhaps this is due to the weekday?
# Total crimes per weekday of the clearance date.
dat %>%
  select(Clearance_Date) %>%
  filter(!is.na(Clearance_Date)) %>%
  mutate(Weekday = lubridate::wday(Clearance_Date, label = TRUE)) %>%
  group_by(Weekday) %>%
  summarise(Count = n()) %>%
  ggplot(aes(Weekday, Count)) +
  geom_bar(stat = 'identity', width = 0.5) +
  theme_grey() +
  labs(y='Number of Crimes', title = 'Number of Crimes by Weekday')
There is considerably less crime on the weekend. Perhaps this may be due to data collection, or perhaps crime occurs during the weekdays for strategic purposes.
# Total crimes per month, restricted to dates before 2016 so each month is
# represented by one full year of data.
dat %>%
  select(Clearance_Date) %>%
  filter(!is.na(Clearance_Date),
         Clearance_Date < as.Date('2016-01-01')) %>%
  mutate(Month = lubridate::month(Clearance_Date, label = TRUE)) %>%
  group_by(Month) %>%
  summarise(Count = n()) %>%
  ggplot(aes(Month, Count)) +
  geom_bar(stat = 'identity', width = 0.5) +
  theme_grey() +
  labs(y='Number of Crime by Month', title = 'Number of Crimes by Month')
First, there is a disparity between the zip codes in the data sets. All the zip codes in the housing data set (2014_Housing_Market_Analysis_Data_by_Zip_Code.csv) are found in the crime data set (Crime_Housing_Joined.csv). However, not all of the zip codes found in the crime data set are found in the housing data set. This is shown in the following code.
# Compare the zip codes present in each data set.
d_zips <- unique(dat$Zip_Code_Crime)
h_zips <- unique(hzip15$`Zip Code`)
all(h_zips %in% d_zips) # every housing zip appears in the crime data
## [1] TRUE
all(d_zips %in% h_zips) # but not every crime zip appears in the housing data
## [1] FALSE
# The crime-data zips with no housing-data counterpart.
not_fnd <- setdiff(d_zips, h_zips)
cat('The Zip codes', not_fnd, 'are not found in the housing data set')
## The Zip codes 78719 78653 78747 78613 78660 78736 78725 78652 78733 78737 78712 are not found in the housing data set
# Count crimes per zip code (restricted to zips present in the housing
# data), then attach the housing variables for those zips.
dat %>%
rename(`Zip Code` = Zip_Code_Crime) %>%
filter(`Zip Code` %in% hzip15$`Zip Code`) %>%
group_by(`Zip Code`) %>%
summarise(num_crimes = n()) %>%
ungroup() %>%
full_join(hzip15 %>%
mutate(`Zip Code` = as.character(`Zip Code`)), by = "Zip Code") %>%
filter(!is.na(`Zip Code`)) -> hdat
# Horizontal bar chart of crime counts per zip code, ordered by count.
hdat %>%
  ggplot(aes(reorder(`Zip Code`, num_crimes), num_crimes)) +
  geom_bar(stat = 'identity', width = 0.5) +
  theme_grey() +
  coord_flip() +
  # Fix: labs() labels aesthetics, not displayed axes — x is the zip-code
  # aesthetic and y the count. The original had the two titles swapped, so
  # the flipped plot's axes were mislabeled.
  labs(x = 'Zip Code', y = 'Number of Crimes', title = 'Number of Crimes by Zip Code')
# Correlation of every housing variable with the per-zip crime count,
# sorted from most positive to most negative.
hdat %>%
  select(-`Zip Code`) %>%
  as.matrix() -> hmat
# Fix: spell out 'pairwise.complete.obs'; the original relied on R's
# partial matching of the abbreviated value 'pairwise.complete'.
hmat_cor <- cor(hmat, use = 'pairwise.complete.obs')
sort(hmat_cor[,'num_crimes'], decreasing = TRUE)
## num_crimes
## 1.000000000
## Percentage of homes within 1/4-mi of transit stop
## 0.580001979
## Owner units affordable to average teacher
## 0.423820471
## Population below poverty level
## 0.411416265
## Hispanic or Latino, of any race
## 0.407429252
## Homes affordable to people earning less than $50,000
## 0.405675044
## Population with disability
## 0.335856977
## Owner units affordable to average tech worker
## 0.335722494
## Rent-restricted units
## 0.335522168
## Rental units affordable to average artist
## 0.296059290
## Unemployment
## 0.279174734
## Percentage of rental units in poor condition
## 0.273446534
## Rental units affordable to average teacher
## 0.271934904
## Owner units affordable to average artist
## 0.266701528
## Rental units affordable to average retail/service worker
## 0.264518528
## Rental units affordable to average tech worker
## 0.226189316
## Rentals affordable to people earning less than $25,000
## 0.213281649
## Change in median home value, 2000-2012
## 0.188566845
## Owner units affordable to average retail/service worker
## 0.087700618
## Change in median rent, 2000-2012
## 0.080470912
## Housing Choice Voucher holders
## 0.077561134
## Percentage of housing and transportation costs that is transportation-related
## 0.058382881
## Large households (5+ members)
## -0.003000815
## Non-White, Non-Hispanic or Latino
## -0.069126800
## Median rent
## -0.241793565
## Median home value
## -0.280257309
## Change in percentage of population below poverty, 2000-2012
## -0.335252227
## Percent change in number of housing units, 2000-2012
## -0.351134331
## Average monthly transportation cost
## -0.375293889
## Median household income
## -0.421947054
Let’s draw a correlation plot of the data to get an idea about some of the interactions in the data.
# Correlation plot of all housing variables plus the crime count.
# Strip the (very long) variable names so the grid is legible.
rownames(hmat_cor) <- colnames(hmat_cor) <- NULL
corrplot(hmat_cor)
The correlation plot shows many possible interactions between the variables. This adds further caution against interpreting the raw correlations directly.
I’d like to know where most crimes occur in Austin, Texas. To do this, I will try to identify clusters of crime by using the k-means algorithm on the coordinates. First, we have to create a data frame that only contains the coordinates.
# Keep only the (longitude, latitude) columns for clustering.
df_no_missing %>%
select(X_Coordinate, Y_Coordinate) -> coord_df
Next, we’ll try to identify the appropriate number of clusters by plotting the within sum of squares versus the number of clusters.
# Elbow plot: total within-cluster sum of squares for k = 2..10.
# Fix: removed the accumulator `b`, which the original allocated but
# never used.
# NOTE(review): kmeans() uses random starts; without set.seed() this curve
# is not exactly reproducible between runs.
Nk <- 10
w <- numeric(Nk)
for (num_cent in 2:Nk){
  fit <- kmeans(coord_df, num_cent)
  w[num_cent] <- fit$tot.withinss
}
w <- w[-1]  # drop the unused k = 1 slot so w aligns with 2:Nk
qplot(2:Nk,w, xlab = 'Number of Clusters', ylab = "Within SS")
From this plot it appears 4 clusters will suffice.
# Fit the final 4-cluster solution and overlay the centers on plt1.
# NOTE(review): kmeans() is random — without set.seed() the centers can
# differ between runs.
kmeans_fit <- kmeans(coord_df, centers = 4)
cntrs <- kmeans_fit$centers
# Putting the constant 'red' inside aes() maps it as a discrete level
# (rendered in the default palette colour, not literal red) and creates
# the legend that scale_colour_discrete() below retitles as "Cluster".
plt1 +
geom_point(aes(cntrs[1,1], cntrs[1,2], col = 'red'))+
geom_point(aes(cntrs[2,1], cntrs[2,2], col = 'red'))+
geom_point(aes(cntrs[3,1], cntrs[3,2], col = 'red'))+
geom_point(aes(cntrs[4,1], cntrs[4,2], col = 'red'))+
scale_colour_discrete(name ="Cluster", labels = NULL)
As the following code shows, about 81% of crime is committed within about a 3.5 mile radius of one of these centers.
# Distance (miles) from every crime location to each cluster center.
clust1 <- c(cntrs[1,1], cntrs[1,2])
clust2 <- c(cntrs[2,1], cntrs[2,2])
clust3 <- c(cntrs[3,1], cntrs[3,2])
clust4 <- c(cntrs[4,1], cntrs[4,2])
meters2miles <- 1/(1000*1.60934)  # distHaversine() returns meters
# Fix: distHaversine() is vectorized over a matrix of points, so the
# original row-by-row loop is replaced by one call per center. Same
# results, without ~38k iterations.
crime_pts <- cbind(coord_df$X_Coordinate, coord_df$Y_Coordinate)
dist_clust1_miles <- distHaversine(crime_pts, clust1) * meters2miles
dist_clust2_miles <- distHaversine(crime_pts, clust2) * meters2miles
dist_clust3_miles <- distHaversine(crime_pts, clust3) * meters2miles
dist_clust4_miles <- distHaversine(crime_pts, clust4) * meters2miles
# Proportion of crimes within num_miles of at least one cluster center.
dist_df <- tbl_df(data.frame(dist_clust1_miles, dist_clust2_miles,
                             dist_clust3_miles, dist_clust4_miles))
num_miles <- 3.5
near_any_center <- dist_df %>%
  filter(dist_clust1_miles < num_miles
         | dist_clust2_miles < num_miles
         | dist_clust3_miles < num_miles
         | dist_clust4_miles < num_miles)
nrow(near_any_center)/nrow(dist_df)
## [1] 0.8144335